####################################################################################
####### Creating data sets

# This code is not replicable because it uses raw data with identifiable information

# Initial part of the code (up through line 760) 
#   Creates the main data set (survey data + admin)
#   Creates subsets of the main set used in each paper
#   Creates anonymous (public) versions of these data sets

# Second part of the code 
#   Creates datasets for the analysis of attrition

# Attention: This is running with the auxiliary functions in the Replication-Making-JoP folder

#This code was revised on October 12 2022 to recode the ipw variable

####################################################################################

rm(list=ls())
options(scipen = 999) #disabling sci notation

library(tidyverse)
library(readxl)
library(data.table)
library(estimatr)
library(randomizr)
library(genderBR)
library(lubridate)
library(mice)

home <- "~/Dropbox/MCMV_replication/"

source("~/Dropbox/Replication-Making-JoP/Code/functions.R") 

#Administrative data on beneficiaries 
  far <- read_excel("~/Dropbox/MCMV_replication/DB/BENEFICIARIOSMCMV122017.xlsx", sheet = 1,
                    skip = 1)
  pnhu <- read_excel("~/Dropbox/MCMV_replication/DB/BENEFICIARIOSMCMV122017.xlsx", sheet = 3,
                     skip = 1)
  fds <- read_excel("~/Dropbox/MCMV_replication/DB/BENEFICIARIOSMCMV122017.xlsx", sheet = 2,
                    skip = 1)
 
#Combine and add column about type
  fds <- fds %>% mutate(type = rep("fds", rep(nrow(fds))))
  far <- far %>% mutate(type = rep("far", rep(nrow(far))))
  pnhu  <- pnhu %>% mutate(type = rep("pnhu", rep(nrow(pnhu))))
 
  far <- far %>% mutate(Numero_PIS_do_Mutuario = as.character(Numero_PIS_do_Mutuario))
  mcmv <- far %>% bind_rows(pnhu, fds)
  rj <- mcmv %>% filter(UF == "RJ" & Nome_Municipio == "RIO DE JANEIRO")
  table(nchar(rj$Numero_CPF_Mutuario))
  rj$cpfstd <- sprintf("%011.0f", rj$Numero_CPF_Mutuario)#From numeric to character with fixed number of digits
save(rj, file = "~/Dropbox/MCMV_replication/DB/rj_beneficiaries.Rda")

#Import administrative data
load(paste0(home, "DB/rj_beneficiaries.Rda"))
load(paste0(home, "DB/all_wide_rais_october.Rda"))

# Import survey data
bd <- read_csv(paste0(home, "DB/fgv_mcmv_campo.csv"), 
               col_types = cols(q90_anotar = col_character(), 
                                envio_whatsapp = col_character(), 
                                q25_b_new_other = col_character(), 
                                q82_new = col_character(), 
                                q83_new = col_character(), 
                                wpp = col_character(), 
                                assigned_to = col_character(), 
                                q89_new = col_character(), 
               status_outros_detalhamento = col_character(), 
               dados_novo = col_character()))

#Number of rows
n_i <- nrow(bd)

# Import fieldID data (personal identifiers data)
load(paste0(home, "DB/fieldIDs.RData"))

#Joining survey and fieldID
bd <- bd %>% mutate(fieldID = fieldid) %>% left_join(all, by = "fieldID")
nrow(bd)==n_i #smell check
table(is.na(bd$cpfstd)) #smell check
bd <- bd %>% filter(!is.na(cpfstd))

#REMOVING HIGH EFFORT DATA COLLECTION (from 05-14 to 06-01)
int_he <- interval(ymd("2020-05-24"), ymd("2020-06-01")) #high-effort interviews
bd <- bd %>% mutate(dates = ymd(data_de_campo),
                    he = ifelse(dates %within%  int_he, 1, 0)) 
bd <- bd %>% filter(he != 1) #exclude interviews conducted in high-effort period. 
stopifnot(nrow(bd) == 42965 -19 - 252) #removing NA CPF and High-Effort

########## Removing subject who withdrew consent

bd <- bd %>% filter(!codigo_listagem == 8945)

# Import full sample data (with treatment indicator)

#Importing sample 1 of round 2
load(paste0(home, "DB/data_sample_W2.1.RData"))
table(W2.1$TT)
length(unique(W2.1$cpfstd))==nrow(W2.1) #CPFs repeat because of participation in multiple lotteries
#Getting TT as character
W2.1 <- W2.1 %>% mutate(TT = as.character(TT))

#Importing sample 2 of round 2
load(paste0(home, "DB/data_sample_W2.2.RData"))
table(W2.2$TT)
length(unique(W2.2$cpfstd))==nrow(W2.2) 
#Getting TT as character
W2.2 <- W2.2 %>% mutate(TT = as.character(TT))

#Combining both samples
W2.1 <- W2.1 %>% mutate(sample = rep(1, nrow(W2.1))) %>% select(-cpf, -DDD1_ENR, -FONE1_ENR, -DDD2_ENR, 
                                                                -FONE2_ENR, -DDD3_ENR, -FONE3_ENR, - STATUS_ENR, 
                                                                -dup)
W2.2 <- W2.2 %>% mutate(sample = rep(2, nrow(W2.2)))

W2 <- bind_rows(W2.1, W2.2)

############ Creating dataset for analysis

ans <- bd %>% mutate(answered = ifelse(b2 == 1, 1, 0), 
                     answered2 = ifelse(c1 == 1, 1, 0)) #getting complete answers

answered <- ans %>% filter(answered == 1 | answered2 == 1)

### Read the "sorteados" file 
load(file=paste0(home, "DB/sorteados_october2018.Rda"))
### Read the "inscritos" file 
load(file=paste0(home, "DB/inscritos_october2018.Rda"))

cpfs_interviewed <- unique(answered$cpfstd)

#Select lotteries inscritos
interviewed_inscritos <- inscritosf %>% filter(cpfstd %in% cpfs_interviewed)
stopifnot(length(unique(interviewed_inscritos$cpfstd))==length(cpfs_interviewed))
interviewed_inscritos <- interviewed_inscritos %>% filter(id_edital %in% c("edital03.2011", "edital06.2011"))

#smell check
table(interviewed_inscritos$id_edital) 
#individuals repeated within edital== Error in the original data
rep <- interviewed_inscritos %>% group_by(cpfstd, id_edital) %>% count() %>% filter(n > 1)
#Fixing it 
interviewed_inscritos <- interviewed_inscritos[-which(interviewed_inscritos$cpfstd == "00462724700" & 
                                                interviewed_inscritos$names == "LUSINETE BENVINDO DE AZEVEDO" &
                                                interviewed_inscritos$id_edital == "edital03.2011"),]
rep <- interviewed_inscritos %>% group_by(cpfstd, id_edital) %>% count() %>% filter(n > 1)

#Selecting lotteries winners in the survey
interviewed_winners <- sorteadosf %>% filter(cpfstd %in% cpfs_interviewed)
interviewed_winners <- interviewed_winners %>% filter(id_edital %in% c("edital03.2011", "edital06.2011"))
table(interviewed_winners$id_edital) #smell check
rep <- interviewed_winners %>% group_by(cpfstd, id_edital) %>% count() %>% filter(n > 1)


#Creating TT labels
interviewed_winners <- interviewed_winners %>% mutate(TT1 = case_when(id_edital == "edital03.2011" ~ "T032011", 
                                                            id_edital == "edital06.2011" ~ "T062011")) %>% select(cpfstd, names, TT1, id_edital)

interviewed_inscritos2 <-  left_join(interviewed_inscritos, interviewed_winners, by = c("cpfstd", "id_edital"))
nrow(interviewed_inscritos2) == nrow(interviewed_inscritos) 

interviewed_inscritos2 <- interviewed_inscritos2 %>% mutate(TT = case_when(id_edital =="edital03.2011" & is.na(TT1) ~ "C032011", 
                                                                           id_edital =="edital06.2011" & is.na(TT1) ~ "C062011", 
                                                                           id_edital =="edital03.2011" & !is.na(TT1) ~ "T032011", 
                                                                           id_edital =="edital06.2011" & !is.na(TT1) ~ "T062011"))
table(interviewed_inscritos2$TT)

#Stacking for individual-lottery format
data_ind_lottery <- interviewed_inscritos2
stopifnot(length(unique(data_ind_lottery$cpfstd))==length(cpfs_interviewed))


#Merge with survey data
survey_W2_v2 <- data_ind_lottery %>% left_join(answered, by = "cpfstd")
stopifnot(nrow(survey_W2_v2)==nrow(data_ind_lottery))

#Housekeeping
survey_W2_stacked <- survey_W2_v2 

####################################################################################
# Compute weights for IPW approach and include in dataset #edited 10/2022
####################################################################################

#Getting rid of duplicates
inscritos_2011_03 <- inscritosf %>% filter(id_edital == "edital03.2011")
inscritos_2011_03 <- inscritos_2011_03[!duplicated(inscritos_2011_03[ , c("cpfstd")]),]  #duplicated here from pdf
n_inscritos_311 <- inscritos_2011_03 %>% count()

n_sorteados_311 <- sorteadosf %>% filter(id_edital == "edital03.2011") %>% count()

p0311 <- n_sorteados_311/n_inscritos_311
w0311 <- as.numeric(1/p0311)

weight_treat_0311 <- w0311
weight_control_0311 <- as.numeric(1/(1-p0311))

n_inscritos_611 <- inscritosf %>% filter(id_edital == "edital06.2011") %>% count()
n_sorteados_611 <- sorteadosf %>% filter(id_edital == "edital06.2011") %>% count()

p0611 <- n_sorteados_611/n_inscritos_611
w0611 <- as.numeric(1/p0611)

weight_treat_0611 <- w0611
weight_control_0611 <- as.numeric(1/(1-p0611))

survey_W2_stacked <- survey_W2_stacked %>% mutate(ipw = ifelse(TT == "T032011", weight_treat_0311,
                                                          ifelse(TT == "C032011", weight_control_0311, 
                                                          ifelse(TT == "T062011", weight_treat_0611, 
                                                          ifelse(TT == "C062011", weight_control_0611, NA)))))

######### Adding Rais data

#Joining
survey_W2_stacked <- survey_W2_stacked %>% left_join(all_wide_rais_all_editais, by = "cpfstd")

stopifnot(nrow(survey_W2_stacked)==nrow(survey_W2_v2))

#Smell check
table(survey_W2_stacked$edital03.2011, survey_W2_stacked$edital06.2011)

surveyW2 <- survey_W2_stacked

############################################## Adding pre-treat CADUNICO (as of 12/2011)

load(paste0(home, "DB/BASE_PESQUISADOR_PESS_122011.RData"))

#first of two lotteries scheduled for 13/08/2011. 
#keep only those enrolled in CADUNICO well before this date
p$dta_cadastramento_memb <- as.Date(p$dta_cadastramento_memb)
p <- subset(p,dta_cadastramento_memb<as.Date("2011-06-01"))
cadunico <- data.frame(cpfstd=cpfs_interviewed
                        ,cadunico=is.element(cpfs_interviewed,p$cpfstd))
surveyW2 <- surveyW2 %>% left_join(cadunico, by = "cpfstd")

############################################## Getting partner's CPFS from  CADUNICO (as of 12/2013)
# Moved this up so that it is done before the merge with Caixa data 
# Using 2013 because by then all beneficiaries would have signed up to Cadunico
#1 get full (CSV) dataset that includes a code for type of relative (not in abbreviated dataset)
#2 identify families of subjects in our dataset
#3 get all family members
#4 keep relatives of code =1 or =2
#5 merge all family members with Caixa Economica data
#6 merge compliance by family back into dataset

#1 get data
compcadunico <- read.csv2(paste0(home, "DB/BASE_PESQUISADOR_PESS_122013.csv"))
compcadunico <- subset(compcadunico,select=c(cod_familiar_fam, 
                                             cod_parentesco_rf_pessoa, 
                                             nom_pessoa,
                                             num_nis_pessoa_atual,
                                             dta_nasc_pessoa,
                                             num_cpf_pessoa))
compcadunico$cpfstd <- sprintf("%011.0f", compcadunico$num_cpf_pessoa)#
compcadunico$cpfstd[is.na(compcadunico$num_cpf_pessoa)] <- NA
compcadunico$num_cpf_pessoa <- NULL

#2 identify families in the sample
surveyW2 <- surveyW2 %>% left_join(subset(compcadunico,select=c(cpfstd,cod_familiar_fam)),by="cpfstd") 
fam.in.sample <- compcadunico$cod_familiar_fam[is.element(compcadunico$cpfstd,cpfs_interviewed )]
cat("Found",sum(is.element(compcadunico$cpfstd,cpfs_interviewed ))
    ,"individuals from our sample in CADUNICO 2013/12,",
    length(unique(fam.in.sample)),"families.\nAs of 2011/09 there were only",
    sum(cadunico$cadunico)," individuals in CADUNICO.")

#3 get all family members
all.relatives  <- compcadunico %>% filter(is.element(cod_familiar_fam,fam.in.sample))
all.relatives$in.W2 <- F #include indicator for being in sample, to facilitate back merge at the end
all.relatives$in.W2[is.element(all.relatives$cpfstd,cpfs_interviewed )]<-T
cat("Found",nrow(all.relatives)
    ,"individuals in these",
    length(unique(all.relatives$cod_familiar_fam)),"CADUNICO families\n")

#4 keep only relatives code 1 or 2 (husband or wifes or equivalents)
relatives <- subset(all.relatives,cod_parentesco_rf_pessoa==1|cod_parentesco_rf_pessoa==2)
relatives <- subset(relatives,is.na(cpfstd)==F)
cat("Keeping",nrow(relatives)
    ,"selected relatives with valid CPFS from these",
    length(unique(relatives$cod_familiar_fam)),"CADUNICO families\n")


##############################################  Look for compliers in CEF data
#This whole part is done for individuals (identify if individuals signed a contract)

#0. Spelling according to the CEF document
empreendimentos_edital03_2011 <- c("PARK IMPERIAL", "PARK ROYAL", "DESTRI", "TOLEDO", "RES RIO BONITO", "ESTORIL")
empreendimentos_edital06_2011 <- c("SEVILHA", "TARONI", "TOLEDO", "CAPCAIS")
empreendimentos_W2 <- union(empreendimentos_edital03_2011,empreendimentos_edital06_2011)

#1. get relevant information from surveys to relatives object by family
survey.vars <- subset(surveyW2, is.na(cod_familiar_fam)==F,
                            select=c(cod_familiar_fam,id_edital,cpfstd)) %>%
                            rename(cpfstd_winner=cpfstd)
survey.vars <- survey.vars[which(duplicated(survey.vars$cpfstd_winner)),] #keep one obs per individual

# 2. now doing the same for actual interviewees/lottery (not individuals)
interviewed <- subset(surveyW2,select=c(cod_familiar_fam,cpfstd,id_edital))
interviewed <- interviewed[duplicated(interviewed$cpfstd)==F,]  #keep unique 
interviewed.compliers <- interviewed %>% left_join(subset(rj, select=c(cpfstd, Data_da_Assinatura, Nome_Empreendimento)), by = "cpfstd") %>% 
  mutate(year_signature = year(Data_da_Assinatura)) 
interviewed.compliers <- interviewed.compliers %>% mutate(contract = case_when(year_signature < 2014 &
                                                                                Nome_Empreendimento %in% empreendimentos_W2 ~ "contract"),
                                                          contract_ever = ifelse(is.na(year_signature), 0, 1))

                                                        
#3. create two objects for CEF merge, one for individuals in survey and their relatives
#   this makes it easier to handle few cases in which two individuals from the same family mattered
relatives  <-  relatives %>% filter(in.W2==F)
  
relatives <- relatives %>% left_join(survey.vars, by= "cod_familiar_fam") %>%
                  select(cod_familiar_fam,cpfstd,id_edital,cpfstd_winner)

relatives.compliers <- relatives %>% inner_join(subset(rj, select=c(cpfstd, Data_da_Assinatura, Nome_Empreendimento)), by = "cpfstd") %>%
                            mutate(year_signature = year(Data_da_Assinatura))

relatives.compliers <- relatives.compliers %>% mutate(contract = case_when(year_signature < 2014 &
                                                                            Nome_Empreendimento %in% empreendimentos_W2 ~ "contract"),
                                                      contract_ever = ifelse(is.na(year_signature), 0, 1)
                                                  )

#4. Merge back into the survey 
surveyW2 <- surveyW2 %>% left_join(subset(interviewed.compliers,select=c(cpfstd,contract,contract_ever)),by="cpfstd") %>% 
                                            rename(contract_individual = contract, contract_ever_individual = contract_ever)

surveyW2 <- surveyW2 %>% left_join(subset(relatives.compliers,select=c(cpfstd_winner,contract,contract_ever, cod_familiar_fam)),by=c("cod_familiar_fam")) %>% 
                                            rename(contract_relative = contract, contract_ever_relative = contract_ever)

# Check whether "compliers" were treated in either edital
check1 <- interviewed.compliers %>% left_join(sorteadosf,by="cpfstd")
print(table(check1$id_edital.x[check1$contract=="contract"],check1$id_edital.y[check1$contract=="contract"])) #four were treated in 09.2011
swich.to.D0 <- check1$cpfstd[which(check1$contract=="contract"&check1$id_edital.y=="edital09.2011")] #find their CPFs
surveyW2$contract_individual[which(is.element(surveyW2$cpfstd,swich.to.D0))] <- NA #correcting these to D=0

# Clean up
rm(relatives.compliers,interviewed.compliers,interviewed,relatives,survey.vars,fam.in.sample,compcadunico,all.relatives,swich.to.D0,check1)

#Creating list of subjects to purchase data on
#Getting CPFs for  purchase of birthdate
# cpfs_final_sample <- unique(surveyW2$cpfstd)
# cpfs_final_sample <- as.data.frame(as.matrix(cpfs_final_sample))
# 
# write_csv(cpfs_final_sample, "~/Dropbox/MCMV_replication/Wave_2/W2-1Sample/cpfs_full_sample.csv", 
#           col_names = F)


#Second round of Getting CPFs for purchase of birthdate
# tmp <- surveyW2 %>% filter(is.na(NASCIMENTO_ENR))
# cpfs_2round <- unique(tmp$cpfstd)
# cpfs_2round <- as.data.frame(as.matrix(cpfs_2round))
#  
# write_csv(cpfs_2round, "~/Dropbox/MCMV_replication/Wave_2/W2-1Sample/cpfs_full_2round.csv", 
#            col_names = F) 

#After getting both rounds of birthday data, it's better to combine them before joining
#Add birthdate
# Import vendor birthdate
dob <- read_csv2(paste0(home, "DB/job354260_enriquecimentoData.csv"))
dob2 <- read_csv2(paste0(home, "DB/job366105_enriquecimentoData.csv"))

#Binding rows
dobf <- bind_rows(dob, dob2)
dobf <- dobf %>% distinct(cpf, .keep_all = T) #There is one repeated CPF (attempted to get it in round 2)
dobf <- dobf %>% mutate(cpfstd = cpf)

surveyW2 <- surveyW2 %>% left_join(dobf, by = "cpfstd")

stopifnot(nrow(surveyW2)==nrow(survey_W2_stacked)) 

#Getting age 

surveyW2 <- surveyW2 %>% mutate(data_nasc = ymd(NASCIMENTO_ENR),
                         idade = interval(start = data_nasc, end = ymd("2020-05-15"))/duration(num = 1, units = "years"))

############################################## Removing personal identifiers
surveyW2 <- surveyW2 %>% select(-cpfstd, -names.x, -names.y, -names.clean, -cpf.x, -cpf3, -cpf,
                                -latitude, -latitude, -envio_email, -dados,-facebook,  -q94_numero, -cpfstd_winner,
                                -q94_nome, -q95_endereco, -nome_completo,  -cpf2.x, -cpf2.y, -NASCIMENTO_ENR, -cpf.y, -STATUS_ENR, -data_nasc)


save(surveyW2, file = paste0(home, "DB/surveyw2_prerecode.Rda"))
survey <- surveyW2
survey <- survey %>% mutate(treat_ind = ifelse(TT %in% c("T032011", "T062011"), 1, 0))

#Recoding variables
survey <- survey %>% mutate(
  Z = treat_ind, 
  inscreveu = dplyr::recode(q74, `1` = 1, `2` = 0,  .default = as.numeric(NA_character_)), 
  inscreveu_r = dplyr::recode(q74, `1` = 1, `2` = 0, `8` = 0, .default = as.numeric(NA_character_)), 
  resultado = dplyr::recode(q80, `1` = 1,`2` = 1, `88` = 0, .default = as.numeric(NA_character_)), 
  ganhou = dplyr::recode(q80, `1` = 1, `2` = 0, `88` = 0, .default = as.numeric(NA_character_)),
  ganhou_r = dplyr::recode(q80, `1` = 1, `2` = 0, `88` = 0, `99` = 0,  .missing = 0),
  keys = dplyr::recode(q81_new, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
  keys_r = dplyr::recode(q81_new, `1` = 1, `2` = 0, .missing = 0),
  D_survey = keys_r,
  contract_ever = ifelse(contract_ever_individual == 1|contract_ever_relative == 1, 1, 0),#ever signed contract in window
  contract_ever = ifelse(is.na(contract_ever), 0, contract_ever),
  D = contract_ever
  )

#Fixing q25_b_new_other for recode
survey$q25_b_new_other[survey$q25_b_new_other == "1900 - ALEGA SER L[IDER COMUNITARIO MAIS DE 15 ANOS"]<- "1900"
survey$q25_b_new_other[survey$q25_b_new_other == "5000 - PERCEBEU-SE SER UMA PESSOA POLITIZADA, ATIVISTA, LIDER NA COMUNIDADE"]<- "5000"

survey <- survey %>% mutate(votept_2018_d = dplyr::recode(q61_new, `1` = 0, `2` = 1, `77` = 0, .default = as.numeric(NA_character_)),#vote for haddad 
                            voteinc_2016_d = dplyr::recode(q59, `3` = 1, `1` = 0, `2` = 0, `4` = 0, `5` = 0, `6` = 0, `7` = 0,
                                                           .default = as.numeric(NA_character_)),
                            votepaes_2020_d = dplyr::recode(q59_new, `1` = 0, `2` = 1, `3` = 0, .default = as.numeric(NA_character_)),
                            turnout_2016 = dplyr::recode(q56, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)), 
                            turnout_2018 = dplyr::recode(q60_new, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            turnout_2020 = dplyr::recode(q59_new, `1` = 1, `2` = 1, `3` = 1, `4`= 0, .default = as.numeric(NA_character_)),
                            bolso_eval = dplyr::recode(q47_new, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)), 
                            temer_eval = dplyr::recode(q49, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)),
                            dilma_eval = dplyr::recode(q47, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)),
                            lula_eval = dplyr::recode(q48, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)), 
                            crivella_eval = dplyr::recode(q50new, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)), 
                            paes_eval = dplyr::recode(q50, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)), 
                            pezao_eval = dplyr::recode(q51, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)),
                            witzel_eval = dplyr::recode(q51_new, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)),
                            party_aff = dplyr::recode(q52, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            party_pt = dplyr::recode(q53, `10` = 1, `1` = 0, `2` = 0, `3` = 0, `4` = 0, `5` = 0, 
                                                     `6` = 0, `7` = 0, `8` = 0,`9` = 0,`11` = 0, `12` = 0
                                                     ,`66` = 0, .default = as.numeric(NA_character_)),
                            party_pt_r = dplyr::recode(q53, `10` = 1, `1` = 0, `2` = 0, `3` = 0, `4` = 0, `5` = 0, 
                                                       `6` = 0, `7` = 0, `8` = 0,`9` = 0,`11` = 0, `12` = 0
                                                       ,`66` = 0, `88`=0, `99`=0, .missing  = as.double(0)),
                            party_pmdb = dplyr::recode(q53, `2` = 1, `1` = 0,  `3` = 0, `4` = 0, `5` = 0, 
                                                       `6` = 0, `7` = 0, `8` = 0,`9` = 0,`10` = 0,`11` = 0, `12` = 0
                                                       ,`66` = 0, .default = as.numeric(NA_character_)),
                            party_pdt = dplyr::recode(q53, `4` = 1, `1` = 0, `2` = 0, `3` = 0,  `5` = 0, 
                                                      `6` = 0, `7` = 0, `8` = 0, `9` = 0,`10` = 0 ,`11` = 0, `12` = 0
                                                      ,`66` = 0, .default = as.numeric(NA_character_)),
                            party_psol = dplyr::recode(q53, `8` = 1, `1` = 0, `2` = 0, `3` = 0, `4` = 0, `5` = 0, 
                                                       `6` = 0, `7` = 0, `9` = 0,`10` = 0 ,`11` = 0, `12` = 0
                                                       ,`66` = 0, .default = as.numeric(NA_character_)),
                            party_anti = dplyr::recode(q54, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            anti_pt_r = as.numeric(grepl("10",survey$q55)), #fulcrum allowed multiple responses, so look for "10"
                            client_g = dplyr::recode(q57, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)), 
                            client_support = dplyr::recode(q57_new1, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)), 
                            talk_cand = dplyr::recode(q57_new2, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)), 
                            talk_ncand = dplyr::recode(q57b_new, `1` = 1, `2` = 0, `88` = 1, .default = as.numeric(NA_character_)), 
                            talk_remember = dplyr::recode(q57e_new, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)), 
                            talk_vote = dplyr::recode(q57f_new, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)), 
                            talk_recent = dplyr::recode(q57g_new, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            cantril = ifelse(q8 < 11, q8, NA), 
                            q10r = dplyr::recode(q10, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            q9_1r = dplyr::recode(q9_1, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            q9_2r = dplyr::recode(q9_2, `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            q9_3r = dplyr::recode(q9_3, `1` = -1, `2` = 0, .default = as.numeric(NA_character_)),
                            q9_4r = dplyr::recode(q9_4, `1` = -1, `2` = 0, .default = as.numeric(NA_character_)),
                            emotions = q9_1r + q9_2r + q9_3r + q9_4r,
                            poor = dplyr::recode(q36a_new , `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            placement = ifelse(q36 < 11, q36, NA), 
                            govind  = dplyr::recode(q11, `2` = 1, `1` = 0, `3` = 0, .default = as.numeric(NA_character_)), 
                            competition  = dplyr::recode(q12, `1` = 1, `2` = 0, `3` = 0, .default = as.numeric(NA_character_)), 
                            retrosp_econself =  dplyr::recode(q37a_new, `1` = 2, `2` = -2, `3` = 0, `8` = as.numeric(NA_character_), 
                                                              .default = as.numeric(NA_character_)), 
                            retrosp_reason = car::recode(ifelse(q37a_new==1,q37b1_new,
                                                                ifelse(q37a_new==2,q37b2_new,
                                                                       ifelse(q37a_new==3,q37b3_new,NA))),"'88'=NA"),
                            retrosp_talenteffort = as.numeric(retrosp_reason==1|retrosp_reason==2),
                            retrosp_help = as.numeric(retrosp_reason==3),
                            retrosp_luck = as.numeric(retrosp_reason==4),
                            retrosp_gov = as.numeric(retrosp_reason==5),
                            current_rel =  dplyr::recode(q37d_new, `1` = 1, `2` = 0, `3` = -1, .default = as.numeric(NA_character_)), 
                            #renamed old q37_r and exp_indexshort to prosp_econself 
                            prosp_self = dplyr::recode(q37, `1` = 1, `2` = -1, `3` = 0, .default = as.numeric(NA_character_)), 
                            prosp_socio = dplyr::recode(q37c_new, `1` = 1, `2` = -1, `3` = 0, .default = as.numeric(NA_character_)), 
                            dream = dplyr::recode(q38a, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)),
                            #In the next items, high values mean right wing (lower taxes or less redistribution)
                            tax_item = dplyr::recode(q43, `1` = 1, `2` = -1, `3` = 0, .default = as.numeric(NA_character_)),
                            red_abstract = dplyr::recode(q45, `2` = 1, `1` = 0, .default = as.numeric(NA_character_)), 
                            red_concrete = dplyr::recode(q46, `1` = 1, `2` = 0, `3`= -1, .default = as.numeric(NA_character_)),
                            red_concrete_v2 = dplyr::recode(q46, `1` = 1, `2` = -1, `3`= -1, .default = as.numeric(NA_character_)),
                            #For components of the market index, high values mean pro-market/right wing
                            effortbetter = dplyr::recode(q13, `1` = 1, `2` = 1, `3` = 0, 
                                                 `4` = 0, `5` = 0, `6` = 0, .default = as.numeric(NA_character_)),
                            trustothers = dplyr::recode(q14, `1` = 1, `2` = 0, `3` = 0, .default = as.numeric(NA_character_)),
                            successalone = dplyr::recode(q15, `1` = 1, `2` = 0, `3` = 0, .default = as.numeric(NA_character_)),
                            moneyimportant = dplyr::recode(q16, `1` = 1, `2` = 0, `3` = 0, .default = as.numeric(NA_character_)),                       
                            religiao_dummy = dplyr::recode(religiao, `1` = 1, `2` = 1, `3` = 1, `4` = 1, `5` = 1, `8` = 1, `6`=0, `7`=0, .default = as.numeric(NA_character_)),    
                            religiao_prot = ifelse(religiao == 2|religiao==3, 1, 0), 
                            sexor = dplyr::recode(sexo, `1` = 1, `2` = 0), #male = 1
                            idade_old = as.numeric(idade > median(idade,na.rm=T)),
                            race_bin = dplyr::recode(cor, `1` = 1, `2` = 0, `3` = 0, `4` = 0, `5` = 0,
                                                     .default = as.numeric(NA_character_)),
                            sch_years = dplyr::recode(esc, `1` = 0, `2` = 1, `3` = 4, `4` = 8, `5` = 11, `6` = 12, `7` = 15, 
                                                      .default = as.numeric(NA_character_)),
                            sch_high = dplyr::recode(esc, `1` = 0, `2` = 0, `3` = 0, `4` = 0, `5` = 1, `6` = 1, `7` = 1, 
                                                     .default = as.numeric(NA_character_)),
                            #renamed these work related variables
                            work_status =  ifelse(q26>7,NA,q26),
                            work_activity = ifelse(q28>7,NA,q28),
                            pea = ifelse(q27 == 1, 1, 0),
                            inc_r = ifelse(rendaf > 10, NA, rendaf),
                            inc_tmp = dplyr::recode(as.numeric(rendaf_1), `1` = 2, `2` = 1, .default = as.numeric(NA_character_)),
                            inc_above1mw = as.numeric(ifelse(is.na(inc_r), inc_tmp, inc_r)>1),
                            commute = as.numeric(as.duration(q32),"minutes"),
                            commute_d = ifelse(commute < median(commute, na.rm = T), 0 , 1),
                            commute_mode = ifelse(q33 > 10, NA, q33),
                            homeowner = dplyr::recode(q17, `1` = 1, `2` = 0, `3` = 0, `4` = 0, `5` = 0, 
                                                      .default = as.numeric(NA_character_)),
                            homerenter = dplyr::recode(q17, `1` = 0, `2` = 1, `3` = 0, `4` = 1, `5` = 0, 
                                                       .default = as.numeric(NA_character_)),
                            homesquat = dplyr::recode(q17, `1` = 0, `2` = 0, `3` = 1, `4` = 1, `5` = 1, 
                                                      .default = as.numeric(NA_character_)),                          
                            homestatus = factor(dplyr::recode(q17, `1` = 0, `2` = 1, `3` = 2, `4` = 2, `5` = 2, 
                                                              .default = as.numeric(NA_character_))), 
                            #renamed rent_spend to: 
                            spend_home = ifelse(q18_new > 10, NA, q18_new),
                            #renames variables for satisfaction with current home
                            satisf_size = ifelse(q22_1>10,NA,q22_1),
                            satisf_conf = ifelse(q22_2new>10,NA,q22_2new),
                            satisf_cost = ifelse(q22_3new>10,NA,q22_3new),
                            #variables for satisfaction with neighborhood
                            satisfn_violence = ifelse(q23_1>10,NA,q23_1),
                            satisfn_transp = ifelse(q23_3>10,NA,q23_3),
                            satisfn_health = ifelse(q23_4>10,NA,q23_4),
                            satisfn_educ = ifelse(q23_5>10,NA,q23_5),
                            netwk_help = dplyr::recode(q25new, `1` = 2, `2` = 1, `3` = -1, `4` = -2,  
                                                       .default = as.numeric(NA_character_)),
                            netwk_size = as.numeric(survey$q25_b_new_other),
                            netwk_size_d = ifelse(netwk_size < median(netwk_size, na.rm = T), 0, 1),
                            #mobilization variables
                            assoc = dplyr::recode(q46_1_new, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5`= -2,  
                                                  .default = as.numeric(NA_character_)),
                            contact_gov = dplyr::recode(q46_2_new , `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            contact_civsoc = dplyr::recode(q46_3_new , `1` = 1, `2` = 0, .default = as.numeric(NA_character_)),
                            contact_collectively = dplyr::recode(q46_4_new , `1` = 0, `2` = 1, .missing=0, .default = as.numeric(0)),
                            #coded NAs (those who didnt petition) as 0's in contact collectively, above...
                            contact_demand = as.numeric(ifelse(q46_5_new > 10, NA, q46_5_new)),
                            contact_transport =  as.numeric(contact_demand==1),
                            contact_healtheduc = as.numeric(contact_demand==2),
                            contact_infrastructure = as.numeric(contact_demand==3),
                            contact_security = as.numeric(contact_demand==4),
                            contact_other = as.numeric(contact_demand==5),
                            contact_demand_v1 = dplyr::recode(q46_5_new, `1` = 1, `2` = 2, 
                                                             `3` = 3, `4` = 4, `5` = 5, `88` = 88, `99` = 99,
                                                             .missing=0, .default = as.numeric(0)),
                            contact_demand_v2 = ifelse(contact_demand_v1 > 80, NA, contact_demand_v1),
                                                
                            contact_transport_v2 =  as.numeric(contact_demand_v2==1),
                            contact_healtheduc_v2 = as.numeric(contact_demand_v2==2),
                            contact_infrastructure_v2 = as.numeric(contact_demand_v2==3),
                            contact_security_v2 = as.numeric(contact_demand_v2==4),
                            contact_other_v2 = as.numeric(contact_demand_v2==5),
                            effectiveness_gov =  dplyr::recode(q46_7_new, `1` = 2, `2` = 1, `4`=-1, `5`=-2, .default = as.numeric(NA_character_)),
                            effectiveness_self = dplyr::recode(q46_7b_new, `1` = 2, `2` = 1, `4`=-1, `5`=-2, .default = as.numeric(NA_character_)),
                            #MCMV
                            q62r = ifelse(q62 < 6, q62, NA), 
                            improved_lives = dplyr::recode(q62r, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, .default = as.numeric(NA_character_)),
                            q63r = ifelse(q63_new < 90, q63_new, NA), 
                            resp_lula = ifelse(q63r == 3, 1, 0),
                            resp_lula_dilma = ifelse(q63r == 3 |q63r == 2, 1, 0),
                            resp_lula_dilma_pt = ifelse(q63r == 3 |q63r == 2 | q63r == 8, 1, 0),
                            approve_lottery = dplyr::recode(q64, `1` = 2, `2` = 1, `3` = 0, `4` = -1, `5` = -2, 
                                                            .default = as.numeric(NA_character_)),
                            q65r = ifelse(q65_new < 5, q65_new, NA),
                            continue_bolsonaro = dplyr::recode(q65r, `1` = 2, `2` = 1, `3` = -1, `4` = -2, 
                                                               .default = as.numeric(NA_character_)),
                            q66r = ifelse(q66_new < 5, q66_new, NA),
                            continue_lula = dplyr::recode(q66r, `1` = 2, `2` = 1, `3` = -1, `4` = -2, 
                                                          .default = as.numeric(NA_character_)),       
                            learned_how = as.numeric(ifelse(q75 < 10, q75, NA)),
                            learned_friendsfamily = as.numeric(learned_how == 1),
                            learned_media =  as.numeric(learned_how == 2|learned_how == 6|learned_how == 7),
                            learned_networks = as.numeric(learned_how == 3|learned_how == 4|learned_how == 5),
                            #"would move" includes those who got the keys and those who would have had they been sorteado
                            q87r = dplyr::recode(q87, `1` = 1, `2` = 0, `3` = 0, `4`= 0,
                                                 .default = as.numeric(NA_character_)),
                            would_move = ifelse(is.na(q87r),keys_r,q87r),
                            hope_better_life = dplyr::recode(q90, `1` = 1, `2` = 0,
                                                             .default = as.numeric(NA_character_)),
                            r2003 = ifelse(is.na(`2003`), 0, `2003`),
                            r2004 = ifelse(is.na(`2004`), 0, `2004`),
                            r2005 = ifelse(is.na(`2005`), 0, `2005`),
                            r2006 = ifelse(is.na(`2006`), 0, `2006`),
                            r2007 = ifelse(is.na(`2007`), 0, `2007`),
                            r2008 = ifelse(is.na(`2008`), 0, `2008`),
                            r2009 = ifelse(is.na(`2009`), 0, `2009`),
                            r2010 = ifelse(is.na(`2010`), 0, `2010`),
                            rp2013 = ifelse(is.na(`2013`), 0, `2013`), 
                            rp2014 = ifelse(is.na(`2014`), 0, `2014`), 
                            lottery1 = ifelse(id_edital == "edital03.2011", 1, 0))
survey$inc_tmp <- NULL 

# Create summary RAIS variables pre-treatment 
survey$av_pre <- rowSums(survey[,grep("^r2\\d\\d\\d$",names(survey))],na.rm=T)/8  
survey$av_prel  <- log(1+survey$av_pre)
survey$formal_pre <- rowSums(survey[,grep("^r2\\d\\d\\d$",names(survey))]!=0)
survey$formal_pre_prop <- rowSums(survey[,grep("^r2\\d\\d\\d$",names(survey))]!=0)/8

# Create summary RAIS variables post-treatment (2013 and 2014)
survey$av_post <- rowSums(survey[,grep("^rp2\\d\\d\\d$",names(survey))],na.rm=T)/2  
survey$av_postl  <- log(1+survey$av_post)
survey$formal_post <- rowSums(survey[,grep("^rp2\\d\\d\\d$",names(survey))]!=0)
survey$formal_post_prop <- rowSums(survey[,grep("^rp2\\d\\d\\d$",names(survey))]!=0)/2

#Creating the Kahneman & Deaton positive affect and blue affect indices
#This is not a normalized index, but we also employ impuation
kahndeat_index_vars <- survey %>% select(q9_1r,q9_2r,q10r,q9_3r,q9_4r)
kahndeat_index_vars <- imp.mice(kahndeat_index_vars)
survey$kahndeat_pos  <- apply(subset(kahndeat_index_vars,select=c(q9_1r,q9_2r,q10r)), 1 ,mean)
survey$kahndeat_blue  <- -1*apply(subset(kahndeat_index_vars,select=c(q9_3r,q9_4r)), 1 ,mean) 
rm(kahndeat_index_vars)

# Create Standardized Indices 
# We create standardized additive indices after imputing missing observations
# We employ multiple imputation within the set of items of each index
# If all items are missing, we drop the observation
exp_index_vars <- imp.mice(survey %>% select(prosp_self, prosp_socio))
red_index_vars <- imp.mice(survey %>% select(tax_item, red_abstract, red_concrete)) 
red_index_vars_v2 <- imp.mice(survey %>% select(tax_item, red_abstract, red_concrete)) 
        # PAP predicted tax_item as a separate outcome, and predicted three components to redistirbution.
        # Cant'have it both ways, so we decided to drop the tax_item as a separate item:
market_index_vars <- imp.mice(survey %>% select(effortbetter,
                                                trustothers,
                                                successalone,
                                                moneyimportant)) #market_index = q13r + q14r + q15r + q16r,
self_index_vars <- imp.mice(survey %>% select(govind, competition)) #self_index = q11r + q12r, 
hap_index_vars <- imp.mice(survey %>% 
                             select(kahndeat_pos, kahndeat_blue, cantril) %>%
                             mutate(kahndeat_blue=-1*kahndeat_blue)) #invert blue so that polarity is the same for all items
satisfh_index_vars <- imp.mice(survey %>% select(satisf_size,satisf_conf,satisf_cost))
satisfn_index_vars <- imp.mice(survey %>% select(satisfn_violence,satisfn_transp
                                                 ,satisfn_health,satisfn_educ))
engage_index_vars <- imp.mice(survey %>% select(contact_gov,contact_civsoc))
effectiveness_index_vars <- imp.mice(survey %>% select(effectiveness_gov,effectiveness_self))

survey <- survey %>% bind_cols(
  red_index = calculate_mean_effects_index(Z = survey$Z
                                           , outcome_mat = red_index_vars, greedy = TRUE ), 
  red_index_v2 = calculate_mean_effects_index(Z = survey$Z
                                           , outcome_mat = red_index_vars_v2, greedy = TRUE ), 
  market_index = calculate_mean_effects_index(Z = survey$Z
                                              , outcome_mat = market_index_vars, greedy = TRUE ), 
  self_index = calculate_mean_effects_index(Z = survey$Z      
                                            , outcome_mat = self_index_vars, greedy = TRUE ),
  hap_index =  calculate_mean_effects_index(Z = survey$Z
                                            , outcome_mat = hap_index_vars,  greedy = TRUE ), 
  exp_index = calculate_mean_effects_index(Z = survey$Z
                                           , outcome_mat = exp_index_vars,  greedy = TRUE ),
  satisfh_index = calculate_mean_effects_index(Z = survey$Z
                                               , outcome_mat = satisfh_index_vars,  greedy = TRUE ),
  satisfn_index = calculate_mean_effects_index(Z = survey$Z
                                               , outcome_mat = satisfn_index_vars, greedy = TRUE ),
  engage_index = calculate_mean_effects_index(Z = survey$Z
                                              , outcome_mat = engage_index_vars, greedy = TRUE ),
  effectiveness_index = calculate_mean_effects_index(Z = survey$Z
                                                     , outcome_mat = effectiveness_index_vars, greedy = TRUE ))

rm(red_index_vars,red_index_vars_v2,
   market_index_vars,self_index_vars
   ,hap_index_vars,exp_index_vars
   ,satisfh_index_vars,satisfn_index_vars
   ,effectiveness_index_vars,engage_index_vars)

#Inputation of Covariates
#Checking if missing exceeds 10%
#In final analysis, this might change
prop.table(table(is.na(survey$religiao_prot)))*100
prop.table(table(is.na(survey$race_bin)))*100
prop.table(table(is.na(survey$sch_high)))*100
prop.table(table(is.na(survey$religiao_prot)))*100
prop.table(table(is.na(survey$sexor)))*100
prop.table(table(is.na(survey$formal_pre)))*100
prop.table(table(is.na(survey$av_prel)))*100
prop.table(table(is.na(survey$idade)))*100

survey$religiao_prot <- imp.meanmode(survey$religiao_prot)#inputing the mode
survey$race_bin  <- imp.meanmode(survey$race_bin)
survey$sch_high <- imp.meanmode(survey$sch_high)
survey$sexor <- imp.meanmode(survey$sexor)
survey$formal_pre <- imp.meanmode(survey$formal_pre)
survey$av_prel <- imp.meanmode(survey$av_prel)
survey$idade <- imp.meanmode(survey$idade)

#saving complete recoded dataset
comment(survey) <- paste("Dataset created in ",as.Date(Sys.Date()),".",sep="")
save(survey, file = paste0(home, "DB/surveyW2.Rda"))

######### Creating the JoP dataset (a subset of the general dataset)

survey <- survey %>% select(fieldID, Z, D, id_edital, sexor, TT, 
                            race_bin, religiao_dummy,
                            idade,
                            sch_years, cadunico,
                            formal_pre, av_prel, 
                            lottery1,
                            av_postl,
                            formal_post, 
                            prosp_socio,
                            tax_item,
                            red_abstract,
                            red_concrete,
                            effortbetter,
                            trustothers,
                            successalone,
                            moneyimportant,
                            govind, 
                            competition, 
                            placement,
                            pea,
                            inc_above1mw, 
                            self_index,
                            effectiveness_index,
                            market_index,
                            red_index, hap_index, 
                            prosp_self, 
                            assoc,
                            talk_recent,
                            contact_gov,
                            contact_civsoc,
                            contact_collectively,
                            effectiveness_gov,
                            effectiveness_self, 
                            contact_transport_v2,      
                            contact_healtheduc_v2,     
                            contact_infrastructure_v2,           
                            contact_security_v2,         
                            current_rel,  
                            commute,         
                            netwk_help,         
                            netwk_size_d,     
                            retrosp_econself,                 
                            retrosp_gov,             
                            retrosp_talenteffort,              
                            satisfh_index,      
                            satisf_size,              
                            satisf_conf,             
                            satisf_cost,              
                            satisfn_index,   
                            satisfn_violence,         
                            satisfn_transp,         
                            satisfn_health,           
                            satisfn_educ,            
                            talk_cand,
                            hope_better_life,           
                            would_move,              
                            client_support,
                            turnout_2016,
                            turnout_2018,
                            turnout_2020,
                            client_g, 
                            r2003, 
                            r2004,
                            r2005,
                            r2006,
                            r2007, 
                            r2008, 
                            r2009, 
                            r2010,
                            edital03.2011, edital06.2011, edital17.2016, edital20.2016,
                            kahndeat_pos, kahndeat_blue, cantril, ipw) 

comment(survey) <- paste("Dataset created in ",as.Date(Sys.Date()),".",sep="")
save(survey, file = "~/Dropbox/Replication-Making-JoP/Data/surveyW2-Making.Rda")



################ Create Attrition datasets

#Among those attempted
#dependent variable: interviewed, did not get interviewed
#unit of analysis is also individual-lottery

#get all attempted
table(bd$he) #no high effort people

#create individual-lottery format

#Removing NA CPFSTD
bd2 <- bd %>% filter(!is.na(cpfstd))
attempted <- unique(bd2$cpfstd) #getting complete answers #15436

#Adding individuals sent and processed by the survey company but who were never attempted to be contacted
W2.1_processed <- read_excel(paste0(home, "DB/dataforfieldW2.1 (6) + HIGIENIZAÇÃO.xlsx"), 
                             sheet = 2)

#Merged with cpf for analysis
W2.1_forfieldcpf <- read_csv(paste0(home, "DB/dataforfieldW2.1withcpf.csv"))
W2.1_processed <- W2.1_processed %>% left_join(W2.1_forfieldcpf, by = "fieldID")

W2.2_processed <- read_excel(paste0(home, "DB/dataforfieldW2.2withcpf - Higienizada e enriquecerida - Campo.xlsx"), 
                             sheet = 1)

W2.2_processed <- W2.2_processed %>% filter_at(vars(DDD1_ENR, FONE1_ENR, 
                                                    DDD2_ENR, FONE2_ENR, 
                                                    DDD3_ENR, FONE3_ENR, 
                                                    ddd1_enr_2, fone1_enr_2, 
                                                    ddd2_enr_2, fone2_enr_2,
                                                    ddd3_enr_2, fone3_enr_2,
                                                    ddd4_enr_2, fone4_enr_2, 
                                                    ddd5_enr_2, fone5_enr_2), any_vars(!is.na(.)))


#Others that were later processed by e-field
tm1 <- setdiff(W2.1_processed$cpfstd, attempted)
tm2 <- setdiff(W2.2_processed$CPF, attempted)
common <- intersect(tm1, tm2) #0
length(tm1) + length(tm2)

W2.2_processed <- W2.2_processed %>% select(names.clean, CPF) %>% rename(cpfstd = CPF)
W2.1_processed <- W2.1_processed %>% select(names.clean, cpfstd)
W2_processed <- bind_rows(W2.1_processed, W2.2_processed)

missing <- W2_processed %>% filter(!cpfstd %in% attempted)

#Combining attempted and missing for table 1
attempted_all <- unique(c(attempted, missing$cpfstd))
length(attempted_all)

### Read the "sorteados" file 
load(paste0(home, "DB/sorteados_october2018.Rda"))
### Read the "inscritos" file
load(paste0(home, "DB/inscritos_october2018.Rda"))

#Select lotteries inscritos
attempted_inscritos <- inscritosf %>% filter(cpfstd %in% attempted_all)
stopifnot(length(unique(attempted_inscritos$cpfstd))==length(attempted_all)) 
attempted_inscritos <- attempted_inscritos %>% filter(id_edital %in% c("edital03.2011", "edital06.2011"))

table(attempted_inscritos$id_edital) #smell check

#Selecting lotteries winners in the attempts
attempted_winners <- sorteadosf %>% filter(cpfstd %in% attempted_all)
attempted_winners <- attempted_winners %>% filter(id_edital %in% c("edital03.2011", "edital06.2011"))
table(attempted_winners$id_edital) #smell check

#Creating TT labels
attempted_winners <- attempted_winners %>% mutate(TT1 = case_when(id_edital == "edital03.2011" ~ "T032011", 
                                                    id_edital == "edital06.2011" ~ "T062011")) %>% select(cpfstd, names, TT1, id_edital)

attempted_inscritos2 <-  left_join(attempted_inscritos, attempted_winners, by = c("cpfstd", "id_edital"))
nrow(attempted_inscritos2) == nrow(attempted_inscritos) 

attempted_inscritos2 <- attempted_inscritos2 %>% mutate(TT = case_when(id_edital =="edital03.2011" & is.na(TT1) ~ "C032011", 
                                                                       id_edital =="edital06.2011" & is.na(TT1) ~ "C062011", 
                                                                       id_edital =="edital03.2011" & !is.na(TT1) ~ "T032011", 
                                                                       id_edital =="edital06.2011" & !is.na(TT1) ~ "T062011"))
table(attempted_inscritos2$TT)

#Stacking for individual-lottery format
data_ind_lottery_attempts <- attempted_inscritos2
#stopifnot(length(unique(data_ind_lottery_attempts$cpfstd))==length(attempted_all)) 
#tmp <- inscritosf %>% filter(cpfstd %in% c("05635614752", "09626075767"))
#tmp <- W2 %>% filter(cpfstd %in% c("05635614752", "09626075767")) #examined  ids
#review 09/11 -- two ids extra

#Adding Rais data
load(paste0(home, "DB/all_wide_rais_october.Rda"))

#Joining
W2_attrition_overall <- data_ind_lottery_attempts %>% left_join(all_wide_rais_all_editais, by = "cpfstd")

stopifnot(nrow(W2_attrition_overall)==nrow(data_ind_lottery_attempts))

#operationalize dependent variable (interviewed == 1, not interviewed == 0)

answered <- bd %>% mutate(answered = ifelse(b2 == 1, 1, 0), 
                          answered2 = ifelse(c1 == 1, 1, 0)) %>% filter(answered == 1 | answered2 == 1)
cpfs_answered <- unique(answered$cpfstd)

#Creating indicator

W2_attrition_overall <- W2_attrition_overall %>% mutate(interviewed = ifelse(cpfstd %in% cpfs_answered, 1, 0))

#Smell checks
table(W2_attrition_overall$interviewed)
table(is.na(W2_attrition_overall$interviewed))

#Treatment indicator
W2_attrition_overall <- W2_attrition_overall %>% mutate(treat_ind = ifelse(TT %in% c("T032011", "T062011"), 1, 0))

#Smell check
table(W2_attrition_overall$edital03.2011, W2_attrition_overall$edital06.2011)

#Getting sex indicator
tmp1 <- get_gender(W2_attrition_overall$names.clean, state = "RJ")
tmp2 <- tibble(tmp1)
W2_attrition_overall <- bind_cols(W2_attrition_overall, tmp2)

#Creating id
# redacted because it reveals private information 

#Making data anonymous

W2_attrition_overall <- W2_attrition_overall %>% select(-cpfstd, -names.x, -names.clean, -sum_participation, 
                           -sum_participation_geral, -sum_participation_territorial, -names.y, -cpf.y,
                           -sum_wins, -sum_wins_geral, -sum_wins_territorial, -cpf.x, -cpf2.x, -cpf2.y, 
                           -cpf3)

W2_attrition_overall <- W2_attrition_overall %>% mutate(r2003 = ifelse(is.na(`2003`), 0, `2003`),
                                                        r2004 = ifelse(is.na(`2004`), 0, `2004`),
                                                        r2005 = ifelse(is.na(`2005`), 0, `2005`),
                                                        r2006 = ifelse(is.na(`2006`), 0, `2006`),
                                                        r2007 = ifelse(is.na(`2007`), 0, `2007`),
                                                        r2008 = ifelse(is.na(`2008`), 0, `2008`),
                                                        r2009 = ifelse(is.na(`2009`), 0, `2009`),
                                                        r2010 = ifelse(is.na(`2010`), 0, `2010`),
                                                        sexor = dplyr::recode(tmp1, `Male` = 1, `Female` = 0))

# Create summary RAIS variables pre-treatment (only years up to 2010 are included)
W2_attrition_overall$av_pre <- rowSums(W2_attrition_overall[,grep("^r2\\d\\d\\d$",names(W2_attrition_overall))],na.rm=T)/8  
W2_attrition_overall$av_prel  <- log(1+W2_attrition_overall$av_pre)
W2_attrition_overall$formal_pre <- rowSums(W2_attrition_overall[,grep("^r2\\d\\d\\d$",names(W2_attrition_overall))]!=0)
W2_attrition_overall$formal_pre_prop <- rowSums(W2_attrition_overall[,grep("^r2\\d\\d\\d$",names(W2_attrition_overall))]!=0)/8

comment(W2_attrition_overall) <- paste("Dataset created in ",as.Date(Sys.Date()),".\n",sep="")
save(W2_attrition_overall, file = paste0(home, "DB/W2_attrition_overall.Rda"))

#Saving public dataset
W2_attrition_overall <- W2_attrition_overall %>% select(id_edital, TT, interviewed, treat_ind, fieldID,   
                                                         sexor, av_pre, av_prel, formal_pre, formal_pre_prop)

save(W2_attrition_overall, file = paste0(home, "DB/W2_attrition_overall_public.Rda"))


##############Among those who picked up (and were our respondents)
#dependent variable: interviewed, did not get interviewed
#unit of analysis is also individual-lottery

#get all attempted
table(bd$he) #no high effort people

#create individual-lottery format

#Removing NA CPFSTD
bd2 <- bd %>% filter(!is.na(cpfstd))
pickedup <- bd %>% filter(!is.na(b2))
pickedup_cpfs <- unique(pickedup$cpfstd) 

### Read the "sorteados" file 
load(paste0(home, "DB/sorteados_october2018.Rda"))
### Read the "inscritos" file 
load(paste0(home, "DB/inscritos_october2018.Rda"))
#Select lotteries inscritos
pu_inscritos <- inscritosf %>% filter(cpfstd %in% pickedup_cpfs)
stopifnot(length(unique(pu_inscritos$cpfstd))==length(pickedup_cpfs)) 
pu_inscritos <- pu_inscritos %>% filter(id_edital %in% c("edital03.2011", "edital06.2011"))
stopifnot(length(unique(pu_inscritos$cpfstd))==length(pickedup_cpfs))

table(pu_inscritos$id_edital) #smell check

#Selecting lotteries winners in the attempts
pu_winners <- sorteadosf %>% filter(cpfstd %in% pickedup_cpfs)
pu_winners <- pu_winners %>% filter(id_edital %in% c("edital03.2011", "edital06.2011"))
table(pu_winners$id_edital) #smell check

#Creating TT labels
pu_winners <- pu_winners %>% mutate(TT1 = case_when(id_edital == "edital03.2011" ~ "T032011", 
                                                    id_edital == "edital06.2011" ~ "T062011")) %>% select(cpfstd, names, TT1, id_edital)

pu_inscritos2 <-  left_join(pu_inscritos, pu_winners, by = c("cpfstd", "id_edital"))
nrow(pu_inscritos2) == nrow(pu_inscritos) 
length(unique(pu_inscritos2$cpfstd))

pu_inscritos2 <- pu_inscritos2 %>% mutate(TT = case_when(id_edital =="edital03.2011" & is.na(TT1) ~ "C032011", 
                                                                       id_edital =="edital06.2011" & is.na(TT1) ~ "C062011", 
                                                                       id_edital =="edital03.2011" & !is.na(TT1) ~ "T032011", 
                                                                       id_edital =="edital06.2011" & !is.na(TT1) ~ "T062011"))
table(pu_inscritos2$TT)

#Stacking for individual-lottery format
data_ind_lottery_pickedup <- pu_inscritos2
stopifnot(length(unique(data_ind_lottery_pickedup$cpfstd))==length(pickedup_cpfs)) 


#Adding Rais data
load(paste0(home, "DB/inscritos_october2018.Rda"))

#Joining
W2_attrition <- data_ind_lottery_pickedup %>% left_join(all_wide_rais_all_editais, by = "cpfstd")

stopifnot(nrow(W2_attrition)==nrow(data_ind_lottery_pickedup))
stopifnot(length(unique(W2_attrition$cpfstd))==length(pickedup_cpfs)) 

#operationalize dependent variable (interviewed == 1, refused == 0)

interviewed <- bd %>% filter(b2 == 1 | c1 == 1) 
refused <-   bd %>% filter(b2 == 3 | c1 %in% c(2, 3))

cpfs_interviewed <- unique(interviewed$cpfstd)
cpfs_refused <- unique(refused$cpfstd)

#Creating indicator

W2_attrition <- W2_attrition %>% mutate(interviewed = ifelse(cpfstd %in% cpfs_interviewed, 1, 0),
                                        refused = ifelse(cpfstd  %in% cpfs_refused, 1, 0), 
                                        interviewed_v2 = ifelse(cpfstd %in% cpfs_interviewed, 1, 
                                                          ifelse(cpfstd  %in% cpfs_refused, 0, NA)))

#Smell checks
table(W2_attrition$interviewed)
table(is.na(W2_attrition$interviewed))
table(is.na(W2_attrition$interviewed_v2))

#Treatment indicator
W2_attrition <- W2_attrition %>% mutate(treat_ind = ifelse(TT %in% c("T032011", "T062011"), 1, 0))

#Smell check
table(W2_attrition$edital03.2011, W2_attrition$edital06.2011)


#Getting sex indicator
tmp1 <- get_gender(W2_attrition$names.clean, state = "RJ")
tmp2 <- tibble(tmp1)
W2_attrition <- bind_cols(W2_attrition, tmp2)


#Creating id
# redacted because it reveals private information

#Making data anonymous

W2_attrition <- W2_attrition %>% select(-cpfstd, -names.x, -names.clean, -sum_participation, 
                                        -sum_participation_geral, -sum_participation_territorial, -names.y, -cpf.y,
                                        -sum_wins, -sum_wins_geral, -sum_wins_territorial, -cpf.x, -cpf2.x, -cpf2.y, 
                                         -cpf3)
  
#Recoding gender and income

W2_attrition <- W2_attrition %>% mutate(r2003 = ifelse(is.na(`2003`), 0, `2003`),
                                        r2004 = ifelse(is.na(`2004`), 0, `2004`),
                                        r2005 = ifelse(is.na(`2005`), 0, `2005`),
                                        r2006 = ifelse(is.na(`2006`), 0, `2006`),
                                        r2007 = ifelse(is.na(`2007`), 0, `2007`),
                                        r2008 = ifelse(is.na(`2008`), 0, `2008`),
                                        r2009 = ifelse(is.na(`2009`), 0, `2009`),
                                        r2010 = ifelse(is.na(`2010`), 0, `2010`),
                                        sexor = dplyr::recode(tmp1, `Male` = 1, `Female` = 0))

# Create summary RAIS variables pre-treatment (only years up to 2010 are included)
W2_attrition$av_pre <- rowSums(W2_attrition[,grep("^r2\\d\\d\\d$",names(W2_attrition))], na.rm=T)/8  
W2_attrition$av_prel  <- log(1+W2_attrition$av_pre)
W2_attrition$formal_pre <- rowSums(W2_attrition[,grep("^r2\\d\\d\\d$",names(W2_attrition))]!=0)
W2_attrition$formal_pre_prop <- rowSums(W2_attrition[,grep("^r2\\d\\d\\d$",names(W2_attrition))]!=0)/8

comment(W2_attrition) <- paste("Dataset created in ",as.Date(Sys.Date()),".\n",sep="") 
save(W2_attrition, file = paste0(home, "DB/W2_attrition.Rda"))

#Saving public dataset
W2_attrition <- W2_attrition %>% select(id_edital, TT, interviewed, treat_ind, 
                                        fieldID,
                                        sexor, av_pre, av_prel, formal_pre, formal_pre_prop)

save(W2_attrition, file = paste0(home, "DB/W2_attrition_public.Rda"))

### Creating dataset for a comparison of survey and admin data  #####

### Read the "sorteados" file 
load(paste0(home, "DB/sorteados_october2018.Rda")) 
#load("Data/sorteados_october2018.Rda")
### Read the "inscritos" file
load(paste0(home, "DB/inscritos_october2018.Rda")) 
#load("Data/inscritos_october2018.Rda")	
### Read the RAIS dataset
load(paste0(home, "DB/all_wide_rais_october.Rda"))

#Stack 
sorteados_2011 <- sorteadosf %>% filter(id_edital == "edital03.2011" | id_edital == "edital06.2011") %>% mutate(treat_ind = 1)
inscritos_2011_03 <- inscritosf %>% filter(id_edital == "edital03.2011")
inscritos_2011_06 <- inscritosf %>% filter(id_edital == "edital06.2011") 
inscritos_2011_03 <- inscritos_2011_03[!duplicated(inscritos_2011_03[ , c("cpfstd")]),]  #duplicated here from pdf
inscritos_2011_06 <- inscritos_2011_06[!duplicated(inscritos_2011_06[ , c("cpfstd")]),]  #no duplicates here
inscritos_2011 <- bind_rows(inscritos_2011_03, inscritos_2011_06)

data_admin <- inscritos_2011 %>% left_join(sorteados_2011, by = c("cpfstd", "id_edital"))
stopifnot(nrow(data_admin)==623411)

#Merge with Rais
data_admin <- data_admin %>% left_join(all_wide_rais_all_editais, by = "cpfstd")
stopifnot(nrow(data_admin)==623411)

#Recode
# Create summary RAIS variables post-treatment (2013 and 2014)
data_admin$av_post <- rowSums(data_admin[,grep("^rp2\\d\\d\\d$",names(data_admin))],na.rm=T)/2  
data_admin$av_postl  <- log(1+data_admin$av_post)
data_admin$formal_post <- rowSums(data_admin[,grep("^rp2\\d\\d\\d$",names(data_admin))]!=0)
data_admin$formal_post_prop <- rowSums(data_admin[,grep("^rp2\\d\\d\\d$",names(data_admin))]!=0)/2

data_admin <- data_admin %>% mutate(rp2013 = ifelse(is.na(`2013`), 0, `2013`), 
                                    rp2014 = ifelse(is.na(`2014`), 0, `2014`), 
                                    treat_indf = ifelse(is.na(treat_ind), 0, treat_ind)) 

#create id
#redacted because it reveals private information

data_admin <- data_admin %>% select(fieldID, id_edital,  treat_indf,
                                    av_post, av_postl,
                                    formal_post, formal_post_prop)

# eliminate one entry without fieldID
data_admin <- data_admin[-which(is.na(data_admin$fieldID)),]

comment(data_admin) <- paste("Dataset created in ",as.Date(Sys.Date()),".\n",sep="") 
save(data_admin, file = paste0(home, "DB/W2_attrition_admin_overall_public.Rda"))

